library(ggplot2) # graphics library
library(MASS)    # contains data sets, including Boston
library(ISLR)    # contains code and data from the textbook
library(knitr)   # contains kable() function
library(gridExtra) #Miscellaneous Functions for "Grid" Graphics
library(dplyr)  # contains useful functions for data manipulation
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(jtools) # Analysis and Presentation of Social Scientific Data
library(magrittr) # A Forward-Pipe Operator for R: %>%
library(kableExtra) # For constructing complex tables
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(ggpubr) # 'ggplot2' Based Publication Ready Plots
## Registered S3 methods overwritten by 'broom':
##   method            from  
##   tidy.glht         jtools
##   tidy.summary.glht jtools
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-1
library(leaps)  # needed for regsubsets
library(boot)   # needed for cv.glm
library(gridExtra)
library(ggcorrplot)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(rpart)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(gbm)
## Loaded gbm 2.1.8
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(rpart)
library(partykit)
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
options(scipen = 4)  # Suppresses scientific notation

Introduction

In this project, we intend to address the issues of flight delays to improve customer satisfaction especially for individuals (also potential customers) who receive travelers at the arrival airport. These uncertainties create a lot of dissatisfaction for passengers as well as those receiving them at the arrival airport and hence damage the overall reputation of an airline. For this analysis, we used 2006 dataset where we used 46,666 rows (with arrival destination, PIT International Airport only) to train our model. Later, we use 20% of the 2018-2019 dataset to test our model.

We believe that 2006 dataset would be an appropriate fit for our test model which would then help us understand how the flight delays have evolved over the years as well move forward to analyze changes between 2006 and 2018-19 dataset.

1. Exploratory analysis

all_PIT_2006 <- read.csv(file = "all_PIT_2006.csv", header = TRUE)
head(all_PIT_2006)
##   Year Quarter Month AirlineID UniqueCarrier Carrier FlightDate DayofMonth
## 1 2006       1     1     20355            US      US  1/11/2006         11
## 2 2006       1     1     20355            US      US  1/11/2006         11
## 3 2006       1     1     20355            US      US  1/11/2006         11
## 4 2006       1     1     20355            US      US  1/11/2006         11
## 5 2006       1     1     20355            US      US  1/11/2006         11
## 6 2006       1     1     20355            US      US  1/11/2006         11
##   DayOfWeek Flights FlightNum TailNum ActualElapsedTime CRSElapsedTime AirTime
## 1         3       1      1607  N813MA                96             90      77
## 2         3       1       135  N114UW                98            109      81
## 3         3       1       139  N747UW               108            111      83
## 4         3       1       559  N433US               111            105      85
## 5         3       1       190  N745UW                73             85      59
## 6         3       1       258  N420US                74             81      57
##   ArrDel15 ArrDel30 ArrDelSys15 ArrDelSys30 ArrDelay ArrTime ArrTimeBlk
## 1        0        0           0           0        1    1651  1600-1659
## 2        0        0           0           0       -7    1722  1700-1759
## 3        1        0           1           0       16    2012  1900-1959
## 4        0        0           0           0        0     750  0700-0759
## 5        0        0           0           0      -13    2242  2200-2259
## 6        0        0           0           0       -4     857  0900-0959
##   CRSArrTime DepDel15 DepDel30 DepDelSys15 DepDelSys30 DepDelay DepTime
## 1       1650        0        0           0           0       -5    1515
## 2       1729        0        0           0           0        4    1544
## 3       1956        1        0           1           0       19    1824
## 4        750        0        0           0           0       -6     559
## 5       2255        0        0           0           0       -1    2129
## 6        901        0        0           0           0        3     743
##   DepTimeBlk CRSDepTime Origin OriginCityName OriginState OriginStateFips
## 1  1500-1559       1520    BDL       Hartford          CT               9
## 2  1500-1559       1540    BOS         Boston          MA              25
## 3  1800-1859       1805    BOS         Boston          MA              25
## 4  0600-0659        605    BOS         Boston          MA              25
## 5  2100-2159       2130    CLT      Charlotte          NC              37
## 6  0700-0759        740    CLT      Charlotte          NC              37
##   OriginStateName OriginWac Dest DestCityName DestState DestStateFips
## 1     Connecticut        11  PIT   Pittsburgh        PA            42
## 2   Massachusetts        13  PIT   Pittsburgh        PA            42
## 3   Massachusetts        13  PIT   Pittsburgh        PA            42
## 4   Massachusetts        13  PIT   Pittsburgh        PA            42
## 5  North Carolina        36  PIT   Pittsburgh        PA            42
## 6  North Carolina        36  PIT   Pittsburgh        PA            42
##   DestStateName DestWac Distance DistanceGroup TaxiIn TaxiOut WheelsOff
## 1  Pennsylvania      23      406             2      5      14      1529
## 2  Pennsylvania      23      496             2      3      14      1558
## 3  Pennsylvania      23      496             2      5      20      1844
## 4  Pennsylvania      23      496             2      5      21       620
## 5  Pennsylvania      23      366             2      4      10      2139
## 6  Pennsylvania      23      366             2      5      12       755
##   WheelsOn Cancelled CancellationCode Diverted CarrierDelay WeatherDelay
## 1     1646         0                         0            0            0
## 2     1719         0                         0            0            0
## 3     2007         0                         0            0            0
## 4      745         0                         0            0            0
## 5     2238         0                         0            0            0
## 6      852         0                         0            0            0
##   NASDelay SecurityDelay LateAircraftDelay
## 1        0             0                 0
## 2        0             0                 0
## 3        0             0                16
## 4        0             0                 0
## 5        0             0                 0
## 6        0             0                 0
all_PIT_201803_201902 <- read.csv(file = "all_PIT_201803_201902.csv", header = TRUE)
head(all_PIT_201803_201902)
##   Year Quarter Month DayofMonth DayOfWeek FlightDate Reporting_Airline
## 1 2018       1     3          1         4   3/1/2018                9E
## 2 2018       1     3          1         4   3/1/2018                9E
## 3 2018       1     3          1         4   3/1/2018                9E
## 4 2018       1     3          1         4   3/1/2018                9E
## 5 2018       1     3          1         4   3/1/2018                9E
## 6 2018       1     3          1         4   3/1/2018                9E
##   DOT_ID_Reporting_Airline IATA_CODE_Reporting_Airline Tail_Number
## 1                    20363                          9E      N902XJ
## 2                    20363                          9E      N272PQ
## 3                    20363                          9E      N309PQ
## 4                    20363                          9E      N293PQ
## 5                    20363                          9E      N272PQ
## 6                    20363                          9E      N200PQ
##   Flight_Number_Reporting_Airline OriginAirportID OriginAirportSeqID
## 1                            3359           14122            1412202
## 2                            3427           11433            1143302
## 3                            3464           14122            1412202
## 4                            3490           14122            1412202
## 5                            3499           14122            1412202
## 6                            3501           14122            1412202
##   OriginCityMarketID Origin OriginCityName OriginState OriginStateFips
## 1              30198    PIT Pittsburgh, PA          PA              42
## 2              31295    DTW    Detroit, MI          MI              26
## 3              30198    PIT Pittsburgh, PA          PA              42
## 4              30198    PIT Pittsburgh, PA          PA              42
## 5              30198    PIT Pittsburgh, PA          PA              42
## 6              30198    PIT Pittsburgh, PA          PA              42
##   OriginStateName OriginWac DestAirportID DestAirportSeqID DestCityMarketID
## 1    Pennsylvania        23         13487          1348702            31650
## 2        Michigan        43         14122          1412202            30198
## 3    Pennsylvania        23         12478          1247805            31703
## 4    Pennsylvania        23         11433          1143302            31295
## 5    Pennsylvania        23         11433          1143302            31295
## 6    Pennsylvania        23         11433          1143302            31295
##   Dest    DestCityName DestState DestStateFips DestStateName DestWac CRSDepTime
## 1  MSP Minneapolis, MN        MN            27     Minnesota      63       1730
## 2  PIT  Pittsburgh, PA        PA            42  Pennsylvania      23       1530
## 3  JFK    New York, NY        NY            36      New York      22        600
## 4  DTW     Detroit, MI        MI            26      Michigan      43       1300
## 5  DTW     Detroit, MI        MI            26      Michigan      43       1736
## 6  DTW     Detroit, MI        MI            26      Michigan      43        956
##   DepTime DepDelay DepDelayMinutes DepDel15 DepartureDelayGroups DepTimeBlk
## 1    1730       NA              NA       NA                   NA  1700-1759
## 2    1536        6               6        0                    0  1500-1559
## 3     600       NA              NA       NA                   NA  0600-0659
## 4    1253       -7               0        0                   -1  1300-1359
## 5    2023      167             167        1                   11  1700-1759
## 6     953       -3               0        0                   -1  0900-0959
##   TaxiOut WheelsOff WheelsOn TaxiIn CRSArrTime ArrTime ArrDelay ArrDelayMinutes
## 1      17      1747     1838      3       1856    1841      -15               0
## 2      84      1700     1736      7       1645    1743       58              58
## 3      13       613      711      5        750     716      -34               0
## 4      17      1310     1358     21       1411    1419        8               8
## 5      23      2046     2129     20       1859    2149      170             170
## 6      19      1012     1101      6       1119    1107      -12               0
##   ArrDel15 ArrivalDelayGroups ArrTimeBlk Cancelled CancellationCode Diverted
## 1        0                 -1  1800-1859         0             <NA>        0
## 2        1                  3  1600-1659         0             <NA>        0
## 3        0                 -2  0700-0759         0             <NA>        0
## 4        0                  0  1400-1459         0             <NA>        0
## 5        1                 11  1800-1859         0             <NA>        0
## 6        0                 -1  1100-1159         0             <NA>        0
##   CRSElapsedTime ActualElapsedTime AirTime Flights Distance DistanceGroup
## 1            146               131     111       1      726             3
## 2             75               127      36       1      201             1
## 3            110                76      58       1      340             2
## 4             71                86      48       1      201             1
## 5             83                86      43       1      201             1
## 6             83                74      49       1      201             1
##   CarrierDelay WeatherDelay NASDelay SecurityDelay LateAircraftDelay
## 1           NA           NA       NA            NA                NA
## 2            0            3       52             0                 3
## 3           NA           NA       NA            NA                NA
## 4           NA           NA       NA            NA                NA
## 5            0            0      128             0                42
## 6           NA           NA       NA            NA                NA
##   FirstDepTime TotalAddGTime LongestAddGTime
## 1           NA            NA              NA
## 2           NA            NA              NA
## 3           NA            NA              NA
## 4           NA            NA              NA
## 5           NA            NA              NA
## 6           NA            NA              NA

First, we begin by cleaning our dataset. For NA values, we decided to first run a count of total NA values and after assessing its overall proportion, we dropped them from our dataset. This was to avoid any problems in our analysis and model training. We also assess the class of each of each variable within all_PIT_2006_clean dataset to understand the nature/state of each variable before we run any analysis.

1.1 Cleaning Dataset

totalNA <- sum(is.na(all_PIT_2006))
all_PIT_2006_clean <- na.omit(all_PIT_2006) # Remove NA rows
# sum(is.na(all_PIT_2006_clean)) # To check no NA values remain

head(all_PIT_2006_clean)
##   Year Quarter Month AirlineID UniqueCarrier Carrier FlightDate DayofMonth
## 1 2006       1     1     20355            US      US  1/11/2006         11
## 2 2006       1     1     20355            US      US  1/11/2006         11
## 3 2006       1     1     20355            US      US  1/11/2006         11
## 4 2006       1     1     20355            US      US  1/11/2006         11
## 5 2006       1     1     20355            US      US  1/11/2006         11
## 6 2006       1     1     20355            US      US  1/11/2006         11
##   DayOfWeek Flights FlightNum TailNum ActualElapsedTime CRSElapsedTime AirTime
## 1         3       1      1607  N813MA                96             90      77
## 2         3       1       135  N114UW                98            109      81
## 3         3       1       139  N747UW               108            111      83
## 4         3       1       559  N433US               111            105      85
## 5         3       1       190  N745UW                73             85      59
## 6         3       1       258  N420US                74             81      57
##   ArrDel15 ArrDel30 ArrDelSys15 ArrDelSys30 ArrDelay ArrTime ArrTimeBlk
## 1        0        0           0           0        1    1651  1600-1659
## 2        0        0           0           0       -7    1722  1700-1759
## 3        1        0           1           0       16    2012  1900-1959
## 4        0        0           0           0        0     750  0700-0759
## 5        0        0           0           0      -13    2242  2200-2259
## 6        0        0           0           0       -4     857  0900-0959
##   CRSArrTime DepDel15 DepDel30 DepDelSys15 DepDelSys30 DepDelay DepTime
## 1       1650        0        0           0           0       -5    1515
## 2       1729        0        0           0           0        4    1544
## 3       1956        1        0           1           0       19    1824
## 4        750        0        0           0           0       -6     559
## 5       2255        0        0           0           0       -1    2129
## 6        901        0        0           0           0        3     743
##   DepTimeBlk CRSDepTime Origin OriginCityName OriginState OriginStateFips
## 1  1500-1559       1520    BDL       Hartford          CT               9
## 2  1500-1559       1540    BOS         Boston          MA              25
## 3  1800-1859       1805    BOS         Boston          MA              25
## 4  0600-0659        605    BOS         Boston          MA              25
## 5  2100-2159       2130    CLT      Charlotte          NC              37
## 6  0700-0759        740    CLT      Charlotte          NC              37
##   OriginStateName OriginWac Dest DestCityName DestState DestStateFips
## 1     Connecticut        11  PIT   Pittsburgh        PA            42
## 2   Massachusetts        13  PIT   Pittsburgh        PA            42
## 3   Massachusetts        13  PIT   Pittsburgh        PA            42
## 4   Massachusetts        13  PIT   Pittsburgh        PA            42
## 5  North Carolina        36  PIT   Pittsburgh        PA            42
## 6  North Carolina        36  PIT   Pittsburgh        PA            42
##   DestStateName DestWac Distance DistanceGroup TaxiIn TaxiOut WheelsOff
## 1  Pennsylvania      23      406             2      5      14      1529
## 2  Pennsylvania      23      496             2      3      14      1558
## 3  Pennsylvania      23      496             2      5      20      1844
## 4  Pennsylvania      23      496             2      5      21       620
## 5  Pennsylvania      23      366             2      4      10      2139
## 6  Pennsylvania      23      366             2      5      12       755
##   WheelsOn Cancelled CancellationCode Diverted CarrierDelay WeatherDelay
## 1     1646         0                         0            0            0
## 2     1719         0                         0            0            0
## 3     2007         0                         0            0            0
## 4      745         0                         0            0            0
## 5     2238         0                         0            0            0
## 6      852         0                         0            0            0
##   NASDelay SecurityDelay LateAircraftDelay
## 1        0             0                 0
## 2        0             0                 0
## 3        0             0                16
## 4        0             0                 0
## 5        0             0                 0
## 6        0             0                 0
quarterVals <- unique(all_PIT_2006_clean$Quarter) # 1,2,3,4. To check if there were any values that should not have been there like 0 or 5,6,7..
monthVals <- unique(all_PIT_2006_clean$Month) # Only values from 1 to 12
DayOfMonth <- unique(all_PIT_2006_clean$DayofMonth) # Only values from 1 to 31
DayOfWeek <- unique(all_PIT_2006_clean$DayOfWeek) # Only values from 1 to 7


classes.PIT_06 <- data.frame(variabletype = sapply(all_PIT_2006_clean,class))
classes.PIT_06
##                   variabletype
## Year                   integer
## Quarter                integer
## Month                  integer
## AirlineID              integer
## UniqueCarrier        character
## Carrier              character
## FlightDate           character
## DayofMonth             integer
## DayOfWeek              integer
## Flights                integer
## FlightNum              integer
## TailNum              character
## ActualElapsedTime      integer
## CRSElapsedTime         integer
## AirTime                integer
## ArrDel15               integer
## ArrDel30               integer
## ArrDelSys15            integer
## ArrDelSys30            integer
## ArrDelay               integer
## ArrTime                integer
## ArrTimeBlk           character
## CRSArrTime             integer
## DepDel15               integer
## DepDel30               integer
## DepDelSys15            integer
## DepDelSys30            integer
## DepDelay               integer
## DepTime                integer
## DepTimeBlk           character
## CRSDepTime             integer
## Origin               character
## OriginCityName       character
## OriginState          character
## OriginStateFips        integer
## OriginStateName      character
## OriginWac              integer
## Dest                 character
## DestCityName         character
## DestState            character
## DestStateFips          integer
## DestStateName        character
## DestWac                integer
## Distance               integer
## DistanceGroup          integer
## TaxiIn                 integer
## TaxiOut                integer
## WheelsOff              integer
## WheelsOn               integer
## Cancelled              integer
## CancellationCode     character
## Diverted               integer
## CarrierDelay           integer
## WeatherDelay           integer
## NASDelay               integer
## SecurityDelay          integer
## LateAircraftDelay      integer

1.2 Data Insights

1a. Carrier flight frequency: We can see from our plot that US carrier is the most frequently appearing carrier in our dataset followed by WN. This is helpful as we move forward with our analysis as we find patterns of flight delays based on carrier and other parameters. As we account for CarrierDelay, we se that US followed by MQ and then WN show the highest number of Carrier Delays.

1b. Calculate proportions via normalization: It is also important to understand that the number of flights vary for each carrier, hence, we should actually compare the proportion of total flights that are delayed instead of their exact numbers. We normalize our data and plot proportions accordingly.As we normalize and account for proportions, EV followed by UA give us the highest number of Carrier delays in our dataset.

ggplot(data = all_PIT_2006_clean,
       aes(x = Carrier)) +
  geom_histogram(stat = "count") # To check each carrier's flights
## Warning: Ignoring unknown parameters: binwidth, bins, pad

# Normalize data and plot proportions

delaysPerCarrier <- all_PIT_2006_clean %>%
  group_by(Carrier) %>%
  summarize(num.obs = n(),
            Sum.of.CarDelay = sum(CarrierDelay > 0))

ggplot(data = delaysPerCarrier,
       aes(x = Carrier,
           y = Sum.of.CarDelay)) + geom_bar(stat = "identity")

# But some Carriers might have fewer flights so we should actually compare the proportion of total flights that got delayed

delaysPerCarrier <- all_PIT_2006_clean %>%
  group_by(Carrier) %>%
  summarize(num.obs = n(),
            Prop.CarDelay = sum(CarrierDelay > 0) / n())

ggplot(data = delaysPerCarrier,
       aes(x = Carrier,
           y = Prop.CarDelay)) +
  geom_bar(stat = "identity")

  1. Origin/Destination City: Each airport and city has its own dynamics which can greatly impact flight schedules and delays subsequently. Working with this assumption, we run analysis to see whether delays were related with the Origin City. Similarly, the destination airports could affects flight delays too such as no clear runway to land due to a high frequency of flights,etc. From our analysis, we see that Seattle followed by San Juan are the Origin cities that experience the highest rate of delays while Cincinnati and San Juan experience the highest number of delays when they are the destination airport.
# Checking if delays had to do anything with the Origin City 

OriginCity.CarrierDelay <- all_PIT_2006_clean %>%
  group_by(OriginCityName, Carrier) %>%
  summarize(total = n(),
            delayedFlightNum = sum(CarrierDelay > 0),
            delayedFlightProp = sum(CarrierDelay > 0) / n())
## `summarise()` has grouped output by 'OriginCityName'. You can override using the `.groups` argument.
OriginCity.CarrierDelay %>%
  group_by(OriginCityName) %>%
  summarize(totFlightsInCity = sum(total),
            totDelayedFlights = sum(delayedFlightNum),
            proportion = totDelayedFlights / totFlightsInCity) %>%
  ggplot(aes(x = OriginCityName,
             y = proportion)) +
  geom_histogram(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5), legend.position = 'none')
## Warning: Ignoring unknown parameters: binwidth, bins, pad

#check if delays had to do anything with the Destination City 
DestCity.CarrierDelay <- all_PIT_2006_clean %>%
  group_by(DestCityName, Carrier) %>%
  summarize(total = n(),
            delayedFlightNum = sum(CarrierDelay > 0),
            delayedFlightProp = sum(CarrierDelay > 0) / n())
## `summarise()` has grouped output by 'DestCityName'. You can override using the `.groups` argument.
DestCity.CarrierDelay %>%
  group_by(DestCityName) %>%
  summarize(totFlightsInCity = sum(total),
            totDelayedFlights = sum(delayedFlightNum),
            proportion = totDelayedFlights / totFlightsInCity) %>%
  ggplot(aes(x = DestCityName,
             y = proportion)) +
  geom_histogram(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5), legend.position = 'none')
## Warning: Ignoring unknown parameters: binwidth, bins, pad

  1. Carrier Delays and Month: We set out to explore monthly affects on Carrier delays too as several cyclical changes such as peak season could possibly delay flights as the system becomes overwhelmed. We see that the month of June and July see the highest proportion of Carrier delays as compared to other months. However, we have to yet see if this is statistically significant.
# Checking if delays (Carrier) had to do anything with the Month
OriginCity.MonthDelay <- all_PIT_2006_clean %>%
  group_by(Month, Carrier) %>%
  summarize(total = n(),
            delayedFlightNum = sum(CarrierDelay > 0),
            delayedFlightProp = sum(CarrierDelay > 0) / n())
## `summarise()` has grouped output by 'Month'. You can override using the `.groups` argument.
OriginCity.MonthDelay %>%
  group_by(Month) %>%
  summarize(totalFlightsInMonth = sum(total),
            totalDelaysInMonth = sum(delayedFlightNum),
            propInMonth = totalDelaysInMonth / totalFlightsInMonth) %>%
  ggplot(aes(x = as.factor(Month),
             y = propInMonth)) +
  geom_histogram(stat = "identity")
## Warning: Ignoring unknown parameters: binwidth, bins, pad

  1. Weather Delays and Month: Weather patterns vary by month and pose different challenges to airline industry. Bad weather can be problematic and can easily cause disruptions in flight schedules. We set out to explore how weather delays have different effects for different months and if any at all. We see that the month of June and July see the highest proportion of Weather delays as compared to other months. However, we have to yet see if this is statistically significant.
# Checking if delays (weather) had to do anything with the Month
WeatherDelay <- all_PIT_2006_clean %>%
  group_by(Month) %>%
  summarize(total = n(),
            delayedFlightNum = sum(WeatherDelay > 0),
            delayedFlightProp = sum(WeatherDelay > 0) / n())

WeatherDelay %>%
  group_by(Month) %>%
  summarize(totalFlightsInMonth = sum(total),
            totalDelaysInMonth = sum(delayedFlightNum),
            propInMonth = totalDelaysInMonth / totalFlightsInMonth) %>%
  ggplot(aes(x = as.factor(Month),
             y = propInMonth)) +
  geom_histogram(stat = "identity")
## Warning: Ignoring unknown parameters: binwidth, bins, pad

  1. Variable selection: We pick ArrDel15 as our response (dependent) variable as we work with the assumption that 15 minutes slot is a reasonable interval for to qualify for a flight arrival delay. We use this as a factor variable, i.e. it equals to 1 if the delay is of 15 minutes or more, and 0 if the delay is less than 15 minutes.
  1. Month: From our boxplot, we can see that whether ArrDel15 is 0 or 1, the median remains the same.
  2. Distance: From our boxplot, we can see that whether ArrDel15 is 0 or 1, the median remains the same.
  3. TaxiIn: From our boxplot, we can see that whether ArrDel15 is 0 or 1, the median remains the same.
  4. DepDelay: From our boxplot, we can see that when ArrDel15 is 1, the median is higher which means that when ArrDel15 is greater than 15minutes, DepDelay is higher too which does make intuitive sense.
  5. WheelsOff: From our boxplot, we can see that when ArrDel15 is 1, the median is higher which means that when ArrDel15 is greater than 15minutes, WheelsOff is at a later time which makes sense given the later the wheels are lifted from destination airport, the greater the arrival delay.
  6. TaxiOut: From our boxplot, we can see that when ArrDel15 is 1, the median is higher which means that when ArrDel15 is greater than 15minutes,TaxiOut is late which makes sense given the longer an airplane takes to taxi out at the destination airport, the greater the arrival delay.
  7. Quarter: From our boxplot, we can see that whether ArrDel15 is 0 or 1, the median remains the same.
# Potential variable analysis

ggplot(data = all_PIT_2006_clean, aes(x = as.factor(ArrDel15), y = Month)) +
  geom_boxplot() + labs(x = "ArrDel15", y = "Month")

ggplot(data = all_PIT_2006_clean, aes(x = as.factor(ArrDel15), y = Distance)) +
  geom_boxplot() + labs(x = "ArrDel15", y = "Distance")

ggplot(data = all_PIT_2006_clean, aes(x = as.factor(ArrDel15), y = TaxiIn)) +
  geom_boxplot() + labs(x = "ArrDel15", y = "TaxiIn")

ggplot(data = all_PIT_2006_clean, aes(x = as.factor(ArrDel15), y = DepDelay)) +
  geom_boxplot() + labs(x = "ArrDel15", y = "DepDelay")

ggplot(data = all_PIT_2006_clean, aes(x = as.factor(ArrDel15), y = WheelsOff)) +
  geom_boxplot() + labs(x = "ArrDel15", y = "WheelsOff")

ggplot(data = all_PIT_2006_clean, aes(x = as.factor(ArrDel15), y = TaxiOut)) +
  geom_boxplot() + labs(x = "ArrDel15", y = "TaxiOut")

ggplot(data = all_PIT_2006_clean, aes(x = as.factor(ArrDel15), y = Quarter)) +
  geom_boxplot() + labs(x = "ArrDel15", y = "Quarter")

  1. Exploring Few More Relationships
  1. The first plot shows the proportion of delays of each type out of the total number of flights that got delayed (by greater than 0 minutes, across different carrier etc.). We see NW has the highest proportion followed by UA and then EV.
  2. We also explore the relationship of WheelsOn with ArrDelay and as we saw earlier with ArrDel15, the later the Wheels are “on”, the greater the arrival delay.
head(all_PIT_2006_clean)
##   Year Quarter Month AirlineID UniqueCarrier Carrier FlightDate DayofMonth
## 1 2006       1     1     20355            US      US  1/11/2006         11
## 2 2006       1     1     20355            US      US  1/11/2006         11
## 3 2006       1     1     20355            US      US  1/11/2006         11
## 4 2006       1     1     20355            US      US  1/11/2006         11
## 5 2006       1     1     20355            US      US  1/11/2006         11
## 6 2006       1     1     20355            US      US  1/11/2006         11
##   DayOfWeek Flights FlightNum TailNum ActualElapsedTime CRSElapsedTime AirTime
## 1         3       1      1607  N813MA                96             90      77
## 2         3       1       135  N114UW                98            109      81
## 3         3       1       139  N747UW               108            111      83
## 4         3       1       559  N433US               111            105      85
## 5         3       1       190  N745UW                73             85      59
## 6         3       1       258  N420US                74             81      57
##   ArrDel15 ArrDel30 ArrDelSys15 ArrDelSys30 ArrDelay ArrTime ArrTimeBlk
## 1        0        0           0           0        1    1651  1600-1659
## 2        0        0           0           0       -7    1722  1700-1759
## 3        1        0           1           0       16    2012  1900-1959
## 4        0        0           0           0        0     750  0700-0759
## 5        0        0           0           0      -13    2242  2200-2259
## 6        0        0           0           0       -4     857  0900-0959
##   CRSArrTime DepDel15 DepDel30 DepDelSys15 DepDelSys30 DepDelay DepTime
## 1       1650        0        0           0           0       -5    1515
## 2       1729        0        0           0           0        4    1544
## 3       1956        1        0           1           0       19    1824
## 4        750        0        0           0           0       -6     559
## 5       2255        0        0           0           0       -1    2129
## 6        901        0        0           0           0        3     743
##   DepTimeBlk CRSDepTime Origin OriginCityName OriginState OriginStateFips
## 1  1500-1559       1520    BDL       Hartford          CT               9
## 2  1500-1559       1540    BOS         Boston          MA              25
## 3  1800-1859       1805    BOS         Boston          MA              25
## 4  0600-0659        605    BOS         Boston          MA              25
## 5  2100-2159       2130    CLT      Charlotte          NC              37
## 6  0700-0759        740    CLT      Charlotte          NC              37
##   OriginStateName OriginWac Dest DestCityName DestState DestStateFips
## 1     Connecticut        11  PIT   Pittsburgh        PA            42
## 2   Massachusetts        13  PIT   Pittsburgh        PA            42
## 3   Massachusetts        13  PIT   Pittsburgh        PA            42
## 4   Massachusetts        13  PIT   Pittsburgh        PA            42
## 5  North Carolina        36  PIT   Pittsburgh        PA            42
## 6  North Carolina        36  PIT   Pittsburgh        PA            42
##   DestStateName DestWac Distance DistanceGroup TaxiIn TaxiOut WheelsOff
## 1  Pennsylvania      23      406             2      5      14      1529
## 2  Pennsylvania      23      496             2      3      14      1558
## 3  Pennsylvania      23      496             2      5      20      1844
## 4  Pennsylvania      23      496             2      5      21       620
## 5  Pennsylvania      23      366             2      4      10      2139
## 6  Pennsylvania      23      366             2      5      12       755
##   WheelsOn Cancelled CancellationCode Diverted CarrierDelay WeatherDelay
## 1     1646         0                         0            0            0
## 2     1719         0                         0            0            0
## 3     2007         0                         0            0            0
## 4      745         0                         0            0            0
## 5     2238         0                         0            0            0
## 6      852         0                         0            0            0
##   NASDelay SecurityDelay LateAircraftDelay
## 1        0             0                 0
## 2        0             0                 0
## 3        0             0                16
## 4        0             0                 0
## 5        0             0                 0
## 6        0             0                 0
delay.carrier <- all_PIT_2006_clean %>%
  group_by(Carrier) %>%
  summarize(count = n(),
            totalArrDelay = sum(ArrDel15 > 0),
            totalDepDelay = sum(DepDelay > 0),
            numCarrierDelays = sum(CarrierDelay > 0),
            numWeatherDelays = sum(WeatherDelay > 0),
            numNASDelays = sum(NASDelay > 0),
            numSecurityDelays = sum(SecurityDelay > 0),
            numLateAircraftDelays = sum(LateAircraftDelay > 0))

# proportion of delays of each type out of the total number of flights that got delayed (by greater than 0 minutes, by carrier etc.)
ggplot(data = delay.carrier,
       aes(x = as.factor(Carrier), y = numCarrierDelays/totalArrDelay)) + 
  geom_histogram(stat = "identity")
## Warning: Ignoring unknown parameters: binwidth, bins, pad

ggplot(data = all_PIT_2006_clean,
       aes(x = WheelsOn, y = ArrDelay)) +
  geom_point(stat = "identity")

# Checking which variables are important by plotting graphs and checking if there are any trends

ggplot(all_PIT_2006_clean,
       aes(x = as.factor(Month), 
           y = ArrDelay)) +
  geom_boxplot()

# Month also does not seem to have a significantly significant difference across the year.

ggplot(all_PIT_2006_clean,
       aes(x = as.factor(Month), 
           y = WeatherDelay)) +
  geom_boxplot()

# Month and WeatherDelay also seem to not be related (in a statistically significant way)

ggplot(all_PIT_2006_clean,
       aes(x = ArrDelay, 
           y = CarrierDelay)) +
  geom_point()

# ArrDelay and CarrierDelay clearly seem to be correlated. There is an upward trend that we can see
getMetrics <- function(table){
  TP <- table["1", "1"]
  FP <- table["1", "0"]
  TN <- table["0", "0"]
  FN <- table["0", "1"]
  
  return(TP)
  return(FP)
  return(TN)
  return(FN)
}

First, we read in both the datasets available to us. One that contains data about flights from 2006 and one that contains information about flights from both 2018 and 2019. Because of the sheer magnitude of the data, we decided to look at flights that arrived at Pittsburgh airport, and so we made a separate dataset that contained those rows for which the destination airport was PIT (Pittsburgh). We also decided to divide the training and testing dataset in a 80:20 ratio and used 20% of the total data (arrivals at pittsburgh airport) for testing and 80% for training.

Because the rows where our outcome or response variable (ArrDel15) was equal to 1, were quite low compared to the values where its value was equal to 0. So we decided to upsample our data by including 15000 such values to our original dataset so that the difference in number of rows having these two different values would decrease.

#all_2006 <- read.csv(file = "all_PIT_2006.csv", header = TRUE)
all_2018_2019 <- read.csv(file = "all_PIT_201803_201902.csv", header = TRUE)
all_2006 <- all_PIT_2006_clean


pit_idx <- which(all_2006$Dest == "PIT")
all_PIT_2006 <- all_2006[pit_idx,]

flights.more.idx <- sample(which(all_PIT_2006$ArrDel15 == "1"), 15000, replace = TRUE)

flights.upsample.2006 <- rbind(all_PIT_2006,
                            all_PIT_2006[flights.more.idx, ])
all_PIT_2006 <- flights.upsample.2006

# 2018 and 2019 data is only for testing. Not training
pit_idx_2018.2019 <- which(all_2018_2019$Dest == "PIT")
all_PIT_2018_2019 <- all_2006[pit_idx_2018.2019,]


# head(all_PIT_2006)
all_PIT_2006_clean <- na.omit(all_PIT_2006)
all_PIT_2018_2019_clean <- na.omit(all_PIT_2018_2019)

nrow(all_PIT_2006_clean)
## [1] 61666
nrow(all_PIT_2018_2019_clean)
## [1] 46580
test.indexes.2006 <- sample(1:nrow(all_PIT_2006_clean), 
                       round(0.2 * nrow(all_PIT_2006_clean)))
train.indexes.2006 <- setdiff(1:nrow(all_PIT_2006_clean), test.indexes.2006)

training.data.pit <- all_PIT_2006_clean[train.indexes.2006,]
testing.data.pit <- all_PIT_2006_clean[test.indexes.2006,]



test.indexes.2018.2019 <- sample(1:nrow(all_PIT_2018_2019_clean), 
                       round(0.2 * nrow(all_PIT_2018_2019_clean)))
train.indexes.2018.2019 <- setdiff(1:nrow(all_PIT_2018_2019_clean), test.indexes.2018.2019)

training.data.pit.2018.2019 <- all_PIT_2018_2019_clean[train.indexes.2018.2019,]
testing.data.pit.2018.2019 <- all_PIT_2018_2019_clean[test.indexes.2018.2019,]

We decided to build a model that would help people who come to pick up their relatives and friends from airports because we believe they suffer more than the travelers who are actually in a plane and can pass time using in-flight entertainment. But the people who come to pick them need to plan accordingly and arrive way before time. If these people could somehow know if the flight was being delayed, they would leave for the airport accordingly and do other important stuff. Since we’re talking about arrivals, we decided to use ArrDel15 (which is a categorical variable that has the value 1 when the flight is more than 15 mins late and 0 otherwise). Even though ArrDelay was another variable that could be used to predict delays, we did not move forward with it because we believed that smaller delays might not affect people that much and only when the delay was 15 mins or more did it start to become tiresome.

We also decided to discard some of the variables that we thought would not make sense to use as our predictor variables. These included variables like FlightNum, TailNum, AirlineID, Year (because it had the same value for all rows in the 2006 dataset) and even date etc. (should we mention all of them?)

We are going to make a vector of all the variables that we think are important in predicting the response variable and then use variable selection techniques taught in class to find the subset of variables that would actually be helpful for our model.

impVarVector <- c("Quarter", "Month", "Carrier",
                  "ActualElapsedTime", "AirTime", "Distance",
                  "ArrDel15", "DayOfWeek", "DayofMonth", "DepDelay", "WheelsOff")
impVarVecNumeric <- c("Quarter", "Month",
                  "ActualElapsedTime", "AirTime", "Distance",
                  "ArrDel15", "DayOfWeek", "DayofMonth", "DepDelay", "WheelsOff")
training.data.pit.subset <- training.data.pit[,impVarVecNumeric]
testing.data.pit.subset <- testing.data.pit[,impVarVecNumeric]
head(training.data.pit.subset)
##   Quarter Month ActualElapsedTime AirTime Distance ArrDel15 DayOfWeek
## 2       1     1                98      81      496        0         3
## 3       1     1               108      83      496        1         3
## 4       1     1               111      85      496        0         3
## 6       1     1                74      57      366        0         3
## 7       1     1                77      60      366        0         3
## 8       1     1                70      59      366        0         3
##   DayofMonth DepDelay WheelsOff
## 2         11        4      1558
## 3         11       19      1844
## 4         11       -6       620
## 6         11        3       755
## 7         11        1      1750
## 8         11        3      1555
# ArrDelayData = training.data.pit[,c(2, 3, 6, 10, 13, 15, 20, 44, 46,
                                     # 47, 48, 49)]

Now that we have made a new dataset with the relevant variables, we decided to check their correlation with each other so we could deal with the highly correlated variables accordingly. We decided to use a threshold of 0.8 and if the correlation came out to be greater than 0.8, we considered those two predictor variables to be highly correlated and decided to drop one of those two to avoid the effect of one variable creeping into the other and making the results statistically insignificant.

corr <- cor(training.data.pit.subset)
ggcorrplot(corr, type = "lower", lab = TRUE)

high.corr <- as.data.frame(as.table(corr))

high.corr <- high.corr[(high.corr$Freq > 0.8 | high.corr$Freq < -0.8) & 
            high.corr$Var1 != high.corr$Var2, ] 
high.corr <- na.omit(high.corr)
high.corr
##                 Var1              Var2      Freq
## 2              Month           Quarter 0.9704198
## 11           Quarter             Month 0.9704198
## 24           AirTime ActualElapsedTime 0.9517271
## 25          Distance ActualElapsedTime 0.9315724
## 33 ActualElapsedTime           AirTime 0.9517271
## 35          Distance           AirTime 0.9851318
## 43 ActualElapsedTime          Distance 0.9315724
## 44           AirTime          Distance 0.9851318

We can see from the table and correlation matrix above that there are a number of variables in the subset that we chose, that are highly correlated with each other (greater than 0.95!). We will now remove one of these variables and also add the Carrier variable that we removed earlier so that the correlation could be performed (can be done only on numeric variables)

varsToRemove <- c(1, 4, 5) # Quarter, ActualElapsedTime and AirTime
finalVars <- training.data.pit[impVarVector]
finalVarsTest <- testing.data.pit[impVarVector]
impVarVectorFinal <- finalVars[-varsToRemove]
impVarVectorTest <- finalVarsTest[-varsToRemove]
colnames(impVarVectorFinal)
## [1] "Month"      "Carrier"    "Distance"   "ArrDel15"   "DayOfWeek" 
## [6] "DayofMonth" "DepDelay"   "WheelsOff"
# head(impVarVectorFinal)

The variables mentioned above are the final list of variables that we have chosen after common sense and correlation methods. We will now apply methods learnt in class to further trim these down if that is possible.

glm_lexp= glm(ArrDel15 ~ .,data = impVarVectorFinal)
kable(coef(summary(glm_lexp)), digits = c(1,0,1,3))
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.1 0 8.2 0.000
Month 0.0 0 10.9 0.000
CarrierCO 0.0 0 -2.1 0.032
CarrierDL 0.0 0 0.1 0.903
CarrierEV 0.1 0 5.9 0.000
CarrierFL 0.0 0 -2.4 0.017
CarrierMQ 0.1 0 5.2 0.000
CarrierNW 0.0 0 1.2 0.236
CarrierOH 0.0 0 -1.4 0.148
CarrierOO 0.0 0 -2.6 0.009
CarrierRU 0.1 0 7.8 0.000
CarrierUA 0.0 0 -1.8 0.080
CarrierUS 0.0 0 -2.7 0.008
CarrierWN 0.0 0 -3.8 0.000
CarrierXE 0.0 0 2.6 0.008
CarrierYV -0.1 0 -7.4 0.000
Distance 0.0 0 -8.5 0.000
DayOfWeek 0.0 0 -2.6 0.009
DayofMonth 0.0 0 1.3 0.210
DepDelay 0.0 0 142.4 0.000
WheelsOff 0.0 0 30.1 0.000
coeffs <- coef(summary(glm_lexp))
statistically.significant <- coef(summary(glm_lexp))[,"Pr(>|t|)"] < 0.05

stat.sig.vars <- names(which(statistically.significant == TRUE))
stat.sig.vars
##  [1] "(Intercept)" "Month"       "CarrierCO"   "CarrierEV"   "CarrierFL"  
##  [6] "CarrierMQ"   "CarrierOO"   "CarrierRU"   "CarrierUS"   "CarrierWN"  
## [11] "CarrierXE"   "CarrierYV"   "Distance"    "DayOfWeek"   "DepDelay"   
## [16] "WheelsOff"

We see that three of the Carriers (namely DL, NW and UA) dont have highly statistically significant coefficients so we decided to apply feature engineering to make categorical variables (new columns) for the remaining carriers. Instead of giving each of these Carriers a numeric value we decided to make a new column for each statistically significant Carrier by assigning 1 if that row belonged to that particular Carrier and 0 otherwise. We did this for all Carriers that had statistically significant coefficients.

newImpVarVectorFinal <- impVarVectorFinal
newImpVarVectorFinalTest <- impVarVectorTest

newImpVarVectorFinal$CarrierCO <- ifelse(newImpVarVectorFinal$Carrier == "CO", 1, 0)
newImpVarVectorFinal$CarrierEV <- ifelse(newImpVarVectorFinal$Carrier == "EV", 1, 0)
newImpVarVectorFinal$CarrierFL <- ifelse(newImpVarVectorFinal$Carrier == "FL", 1, 0)
newImpVarVectorFinal$CarrierMQ <- ifelse(newImpVarVectorFinal$Carrier == "MQ", 1, 0)
newImpVarVectorFinal$CarrierNW <- ifelse(newImpVarVectorFinal$Carrier == "NW", 1, 0)
newImpVarVectorFinal$CarrierOH <- ifelse(newImpVarVectorFinal$Carrier == "OH", 1, 0)
newImpVarVectorFinal$CarrierOO <- ifelse(newImpVarVectorFinal$Carrier == "OO", 1, 0)
newImpVarVectorFinal$CarrierRU <- ifelse(newImpVarVectorFinal$Carrier == "RU", 1, 0)
newImpVarVectorFinal$CarrierUS <- ifelse(newImpVarVectorFinal$Carrier == "US", 1, 0)
newImpVarVectorFinal$CarrierWN <- ifelse(newImpVarVectorFinal$Carrier == "WN", 1, 0)
newImpVarVectorFinal$CarrierXE <- ifelse(newImpVarVectorFinal$Carrier == "XE", 1, 0)


newImpVarVectorFinalTest$CarrierCO <- ifelse(newImpVarVectorFinalTest$Carrier == "CO", 1, 0)
newImpVarVectorFinalTest$CarrierEV <- ifelse(newImpVarVectorFinalTest$Carrier == "EV", 1, 0)
newImpVarVectorFinalTest$CarrierFL <- ifelse(newImpVarVectorFinalTest$Carrier == "FL", 1, 0)
newImpVarVectorFinalTest$CarrierMQ <- ifelse(newImpVarVectorFinalTest$Carrier == "MQ", 1, 0)
newImpVarVectorFinalTest$CarrierNW <- ifelse(newImpVarVectorFinalTest$Carrier == "NW", 1, 0)
newImpVarVectorFinalTest$CarrierOH <- ifelse(newImpVarVectorFinalTest$Carrier == "OH", 1, 0)
newImpVarVectorFinalTest$CarrierOO <- ifelse(newImpVarVectorFinalTest$Carrier == "OO", 1, 0)
newImpVarVectorFinalTest$CarrierRU <- ifelse(newImpVarVectorFinalTest$Carrier == "RU", 1, 0)
newImpVarVectorFinalTest$CarrierUS <- ifelse(newImpVarVectorFinalTest$Carrier == "US", 1, 0)
newImpVarVectorFinalTest$CarrierWN <- ifelse(newImpVarVectorFinalTest$Carrier == "WN", 1, 0)
newImpVarVectorFinalTest$CarrierXE <- ifelse(newImpVarVectorFinalTest$Carrier == "XE", 1, 0)

We use all of these features and feed them into the forward stepwise model as well as backward stepwise model to see if it will further help trim down the variables and if they return the same results. As we can see, they both return the same variables except for two Carriers that are different. The other variables like Month, TaxiIn, Distance and DepDelay are selected by both of them. We did not do a best subsets selection because of the magnitude of our data. An exhaustive search would have taken a lot of time and so we decided to move forward with forward and backward stepwise models and saw that the results were very similar.

flights.subset.bw <- regsubsets(ArrDel15 ~ .,
              data = newImpVarVectorFinal,
               nbest = 1,    # 1 best model for each number of predictors
               nvmax = NULL,    # NULL for no limit on number of variables
               method = "backward", really.big = TRUE)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 11 linear dependencies found
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : nvmax reduced to 20
## Warning in rval$lopt[] <- rval$vorder[rval$lopt]: number of items to replace is
## not a multiple of replacement length
summary(flights.subset.bw)
## Subset selection object
## Call: regsubsets.formula(ArrDel15 ~ ., data = newImpVarVectorFinal, 
##     nbest = 1, nvmax = NULL, method = "backward", really.big = TRUE)
## 31 Variables  (and intercept)
##            Forced in Forced out
## Month          FALSE      FALSE
## CarrierCO      FALSE      FALSE
## CarrierDL      FALSE      FALSE
## CarrierEV      FALSE      FALSE
## CarrierFL      FALSE      FALSE
## CarrierMQ      FALSE      FALSE
## CarrierNW      FALSE      FALSE
## CarrierOH      FALSE      FALSE
## CarrierOO      FALSE      FALSE
## CarrierRU      FALSE      FALSE
## CarrierUA      FALSE      FALSE
## CarrierUS      FALSE      FALSE
## CarrierWN      FALSE      FALSE
## CarrierXE      FALSE      FALSE
## CarrierYV      FALSE      FALSE
## Distance       FALSE      FALSE
## DayOfWeek      FALSE      FALSE
## DayofMonth     FALSE      FALSE
## DepDelay       FALSE      FALSE
## WheelsOff      FALSE      FALSE
## CarrierCO      FALSE      FALSE
## CarrierEV      FALSE      FALSE
## CarrierFL      FALSE      FALSE
## CarrierMQ      FALSE      FALSE
## CarrierNW      FALSE      FALSE
## CarrierOH      FALSE      FALSE
## CarrierOO      FALSE      FALSE
## CarrierRU      FALSE      FALSE
## CarrierUS      FALSE      FALSE
## CarrierWN      FALSE      FALSE
## CarrierXE      FALSE      FALSE
## 1 subsets of each size up to 20
## Selection Algorithm: backward
##           Month CarrierCO CarrierDL CarrierEV CarrierFL CarrierMQ CarrierNW
## 1  ( 1 )  " "   " "       " "       " "       " "       " "       " "      
## 2  ( 1 )  " "   " "       " "       " "       " "       " "       " "      
## 3  ( 1 )  " "   " "       " "       " "       " "       "*"       " "      
## 4  ( 1 )  " "   " "       " "       " "       " "       "*"       " "      
## 5  ( 1 )  "*"   " "       " "       " "       " "       "*"       " "      
## 6  ( 1 )  "*"   " "       " "       " "       " "       "*"       " "      
## 7  ( 1 )  "*"   " "       " "       " "       " "       "*"       " "      
## 8  ( 1 )  "*"   " "       " "       "*"       " "       "*"       " "      
## 9  ( 1 )  "*"   " "       " "       "*"       " "       "*"       " "      
## 10  ( 1 ) "*"   " "       " "       "*"       " "       "*"       " "      
## 11  ( 1 ) "*"   " "       " "       "*"       " "       "*"       " "      
## 12  ( 1 ) "*"   " "       " "       "*"       " "       "*"       " "      
## 13  ( 1 ) "*"   " "       " "       "*"       "*"       "*"       " "      
## 14  ( 1 ) "*"   " "       " "       "*"       "*"       "*"       " "      
## 15  ( 1 ) "*"   "*"       " "       "*"       "*"       "*"       " "      
## 16  ( 1 ) "*"   "*"       " "       "*"       "*"       "*"       " "      
## 17  ( 1 ) "*"   "*"       " "       "*"       "*"       "*"       " "      
## 18  ( 1 ) "*"   "*"       " "       "*"       "*"       "*"       "*"      
## 19  ( 1 ) "*"   "*"       " "       "*"       "*"       "*"       "*"      
## 20  ( 1 ) "*"   "*"       "*"       "*"       "*"       "*"       "*"      
##           CarrierOH CarrierOO CarrierRU CarrierUA CarrierUS CarrierWN CarrierXE
## 1  ( 1 )  " "       " "       " "       " "       " "       " "       " "      
## 2  ( 1 )  " "       " "       " "       " "       " "       " "       " "      
## 3  ( 1 )  " "       " "       " "       " "       " "       " "       " "      
## 4  ( 1 )  " "       " "       "*"       " "       " "       " "       " "      
## 5  ( 1 )  " "       " "       "*"       " "       " "       " "       " "      
## 6  ( 1 )  " "       " "       "*"       " "       " "       " "       " "      
## 7  ( 1 )  " "       " "       "*"       " "       " "       " "       " "      
## 8  ( 1 )  " "       " "       "*"       " "       " "       " "       " "      
## 9  ( 1 )  " "       " "       "*"       " "       " "       " "       "*"      
## 10  ( 1 ) " "       " "       "*"       " "       " "       "*"       "*"      
## 11  ( 1 ) " "       " "       "*"       " "       "*"       "*"       "*"      
## 12  ( 1 ) " "       "*"       "*"       " "       "*"       "*"       "*"      
## 13  ( 1 ) " "       "*"       "*"       " "       "*"       "*"       "*"      
## 14  ( 1 ) " "       "*"       "*"       "*"       "*"       "*"       "*"      
## 15  ( 1 ) " "       "*"       "*"       "*"       "*"       "*"       "*"      
## 16  ( 1 ) "*"       "*"       "*"       "*"       "*"       "*"       "*"      
## 17  ( 1 ) "*"       "*"       "*"       "*"       "*"       "*"       "*"      
## 18  ( 1 ) "*"       "*"       "*"       "*"       "*"       "*"       "*"      
## 19  ( 1 ) "*"       "*"       "*"       "*"       "*"       "*"       "*"      
## 20  ( 1 ) "*"       "*"       "*"       "*"       "*"       "*"       "*"      
##           CarrierYV Distance DayOfWeek DayofMonth DepDelay WheelsOff CarrierCO
## 1  ( 1 )  " "       " "      " "       " "        "*"      " "       " "      
## 2  ( 1 )  " "       " "      " "       " "        "*"      "*"       " "      
## 3  ( 1 )  " "       " "      " "       " "        "*"      "*"       " "      
## 4  ( 1 )  " "       " "      " "       " "        "*"      "*"       " "      
## 5  ( 1 )  " "       " "      " "       " "        "*"      "*"       " "      
## 6  ( 1 )  " "       "*"      " "       " "        "*"      "*"       " "      
## 7  ( 1 )  "*"       "*"      " "       " "        "*"      "*"       " "      
## 8  ( 1 )  "*"       "*"      " "       " "        "*"      "*"       " "      
## 9  ( 1 )  "*"       "*"      " "       " "        "*"      "*"       " "      
## 10  ( 1 ) "*"       "*"      " "       " "        "*"      "*"       " "      
## 11  ( 1 ) "*"       "*"      " "       " "        "*"      "*"       " "      
## 12  ( 1 ) "*"       "*"      " "       " "        "*"      "*"       " "      
## 13  ( 1 ) "*"       "*"      " "       " "        "*"      "*"       " "      
## 14  ( 1 ) "*"       "*"      " "       " "        "*"      "*"       " "      
## 15  ( 1 ) "*"       "*"      " "       " "        "*"      "*"       " "      
## 16  ( 1 ) "*"       "*"      " "       " "        "*"      "*"       " "      
## 17  ( 1 ) "*"       "*"      "*"       " "        "*"      "*"       " "      
## 18  ( 1 ) "*"       "*"      "*"       " "        "*"      "*"       " "      
## 19  ( 1 ) "*"       "*"      "*"       "*"        "*"      "*"       " "      
## 20  ( 1 ) "*"       "*"      "*"       "*"        "*"      "*"       " "      
##           CarrierEV CarrierFL CarrierMQ CarrierNW CarrierOH CarrierOO CarrierRU
## 1  ( 1 )  " "       " "       " "       " "       " "       " "       " "      
## 2  ( 1 )  " "       " "       " "       " "       " "       " "       " "      
## 3  ( 1 )  " "       " "       " "       " "       " "       " "       " "      
## 4  ( 1 )  " "       " "       " "       " "       " "       " "       " "      
## 5  ( 1 )  " "       " "       " "       " "       " "       " "       " "      
## 6  ( 1 )  " "       " "       " "       " "       " "       " "       " "      
## 7  ( 1 )  " "       " "       " "       " "       " "       " "       " "      
## 8  ( 1 )  " "       " "       " "       " "       " "       " "       " "      
## 9  ( 1 )  " "       " "       " "       " "       " "       " "       " "      
## 10  ( 1 ) " "       " "       " "       " "       " "       " "       " "      
## 11  ( 1 ) " "       " "       " "       " "       " "       " "       " "      
## 12  ( 1 ) " "       " "       " "       " "       " "       " "       " "      
## 13  ( 1 ) " "       " "       " "       " "       " "       " "       " "      
## 14  ( 1 ) " "       " "       " "       " "       " "       " "       " "      
## 15  ( 1 ) " "       " "       " "       " "       " "       " "       " "      
## 16  ( 1 ) " "       " "       " "       " "       " "       " "       " "      
## 17  ( 1 ) " "       " "       " "       " "       " "       " "       " "      
## 18  ( 1 ) " "       " "       " "       " "       " "       " "       " "      
## 19  ( 1 ) " "       " "       " "       " "       " "       " "       " "      
## 20  ( 1 ) " "       " "       " "       " "       " "       " "       " "      
##           CarrierUS CarrierWN CarrierXE
## 1  ( 1 )  " "       " "       " "      
## 2  ( 1 )  " "       " "       " "      
## 3  ( 1 )  " "       " "       " "      
## 4  ( 1 )  " "       " "       " "      
## 5  ( 1 )  " "       " "       " "      
## 6  ( 1 )  " "       " "       " "      
## 7  ( 1 )  " "       " "       " "      
## 8  ( 1 )  " "       " "       " "      
## 9  ( 1 )  " "       " "       " "      
## 10  ( 1 ) " "       " "       " "      
## 11  ( 1 ) " "       " "       " "      
## 12  ( 1 ) " "       " "       " "      
## 13  ( 1 ) " "       " "       " "      
## 14  ( 1 ) " "       " "       " "      
## 15  ( 1 ) " "       " "       " "      
## 16  ( 1 ) " "       " "       " "      
## 17  ( 1 ) " "       " "       " "      
## 18  ( 1 ) " "       " "       " "      
## 19  ( 1 ) " "       " "       " "      
## 20  ( 1 ) " "       " "       " "
flights.subset <- regsubsets(ArrDel15 ~ .,
              data = newImpVarVectorFinal,
               nbest = 1,    # 1 best model for each number of predictors
               nvmax = NULL,    # NULL for no limit on number of variables
               method = "forward", really.big = TRUE)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 11 linear dependencies found
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : nvmax reduced to 20
## Warning in rval$lopt[] <- rval$vorder[rval$lopt]: number of items to replace is
## not a multiple of replacement length
print("Coefficients selected by backward stepwise model: ")
## [1] "Coefficients selected by backward stepwise model: "
names(coef(flights.subset.bw, which.min(summary(flights.subset.bw)$bic)))
##  [1] "(Intercept)" "Month"       "CarrierEV"   "CarrierMQ"   "CarrierRU"  
##  [6] "CarrierUS"   "CarrierWN"   "CarrierXE"   "CarrierYV"   "Distance"   
## [11] "DepDelay"    "WheelsOff"
print("Coefficients selected by forward stepwise model: ")
## [1] "Coefficients selected by forward stepwise model: "
names(coef(flights.subset, which.min(summary(flights.subset)$bic)))
##  [1] "(Intercept)" "Month"       "CarrierDL"   "CarrierMQ"   "CarrierNW"  
##  [6] "CarrierRU"   "CarrierWN"   "CarrierYV"   "Distance"    "DepDelay"   
## [11] "WheelsOff"   "CarrierEV"   "CarrierXE"

We plotted RSS (Residual sum of squares), R-squared, BIC and AIC with the number of variables to see how each of these metrics changed with the number of variables, and we were not surprised to see that RSS decreased as more and more variables were included. Similarly, R-squared continued to increase as more and more variables were included because more of the variation in the response variable (ArrDel15) was being explained by the model. The AIC curve slightly surprised us because its lowest value came out to be at 19 variables but this was probably because AIC does not penalize an increase in the number of variables to a great extent. We see in the BIC curve that the model selected a total of 10 variables which is good because BIC penalizes an increase in the number of variables to a greater extent and thus makes the model simpler and less prone to overfitting. This is why we decided to move forward with minimizing the Bayesian Information Classification (BIC) for our variable selection.

flights.summary<-summary(flights.subset)
num_variables<-seq(1,length(flights.summary$rss))

plot_RSS<-ggplot(data = data.frame(flights.summary$rss),
                 aes(x=num_variables,y=flights.summary$rss))+
  geom_line()+
  geom_point(x=which.min(flights.summary$rss),
             y=min(flights.summary$rss),aes(color="red"),
             show.legend = FALSE)+
  xlab("# Variables")+
  ylab("RSS")+
  theme_bw()

plot_R_sq<-ggplot(data = data.frame(flights.summary$rsq),
                 aes(x=num_variables,y=flights.summary.rsq))+
  geom_line()+
  geom_point(x=which.max(flights.summary$rsq),
             y=max(flights.summary$rsq),aes(color="red"),
             show.legend = FALSE)+
  xlab("# Variables")+
  ylab("R-sq")+
  theme_bw()

plot_BIC<-ggplot(data = data.frame(flights.summary$bic),
                 aes(x=num_variables,y=flights.summary.bic))+
  geom_line()+
  geom_point(x=which.min(flights.summary$bic),
             y=min(flights.summary$bic),aes(color="red"),
             show.legend = FALSE)+
  xlab("# Variables")+
  ylab("BIC")+
  theme_bw()

plot_AIC<-ggplot(data = data.frame(flights.summary$cp),
                 aes(x=num_variables,y=flights.summary.cp))+
  geom_line()+
  geom_point(x=which.min(flights.summary$cp),
             y=min(flights.summary$cp),aes(color="red"),
             show.legend = FALSE)+
  xlab("# Variables")+
  ylab("AIC")+
  theme_bw()


grid.arrange(plot_RSS, plot_R_sq,plot_AIC,plot_BIC, ncol=2,nrow=2)

Next we visualize the variables that have minimized the BIC. We only did this for the backward stepwise model because both backward and forward came out to be similar and backward stepwise had more variables so we wanted to include a larger subset of variables.

plot(flights.subset.bw,scale="bic")

We will now run the analysis of variable selection using the lasso

# Using the lasso now to do variable selection

xs<-model.matrix(ArrDel15 ~.,newImpVarVectorFinal)[,-1]
y<-newImpVarVectorFinal$ArrDel15

#alpha=1 is a lasso penalty!
flights.lasso<-glmnet(xs,y,alpha=1)
coef(flights.lasso,s=0.1) # can change this lambda value
## 32 x 1 sparse Matrix of class "dgCMatrix"
##                       1
## (Intercept) 0.336067832
## Month       .          
## CarrierCO   .          
## CarrierDL   .          
## CarrierEV   .          
## CarrierFL   .          
## CarrierMQ   .          
## CarrierNW   .          
## CarrierOH   .          
## CarrierOO   .          
## CarrierRU   .          
## CarrierUA   .          
## CarrierUS   .          
## CarrierWN   .          
## CarrierXE   .          
## CarrierYV   .          
## Distance    .          
## DayOfWeek   .          
## DayofMonth  .          
## DepDelay    0.004414527
## WheelsOff   .          
## CarrierCO   .          
## CarrierEV   .          
## CarrierFL   .          
## CarrierMQ   .          
## CarrierNW   .          
## CarrierOH   .          
## CarrierOO   .          
## CarrierRU   .          
## CarrierUS   .          
## CarrierWN   .          
## CarrierXE   .
plot(flights.lasso, label = TRUE)

The model becomes more complex as we move from left to right i.e. increase the number of variables in the model.

cv.out=cv.glmnet(xs,y,alpha=1)
plot(cv.out)

We use the 1 se lambda value to get the coefficients that are not 0 at that value. Although we decided to use multiple approaches to solving the problem of variable selection, we are most confident about the results from LASSO (the coefficients are a subset of those from backward stepwise selection except for a few), because LASSO uses a tuning parameter to penalize the number of parameters and this prevents overfitting. And we use cross-validation to find the 1 se lambda value to get the coefficients which makes us feel more confident about LASSO’s selection of the coefficients. Some coefficients were not included, like CarrierYV because they did not come out to be statistically significant when we tested them initially.

one.se.lambda <- cv.out$lambda.1se
lasso.coef = predict(flights.lasso, type ="coefficients", s = one.se.lambda)[1:21,]
# lasso.coef[lasso.coef != 0]
one.se.lambda.coeffs <- names(lasso.coef[lasso.coef != 0])[names(lasso.coef[lasso.coef != 0]) != "(Intercept)"]
one.se.lambda.coeffs
##  [1] "Month"     "CarrierEV" "CarrierMQ" "CarrierRU" "CarrierUS" "CarrierWN"
##  [7] "CarrierXE" "CarrierYV" "Distance"  "DepDelay"  "WheelsOff"

Now that we have the relevant coefficients, we train a model that will help us predict whether there will be a 15 min delay or not. This will essentially help the people who will be coming to pick up their relatives from the airport. We plan on training multiple models and see which one performs the best on the test dataset. We first train a decision tree. It would help us decide the path of variable values that lead to an arrival delay of greater than 15 minutes (1) or not (0).

columns.to.select <- c(1, 10, 12, 16, 13, 17, 18, 19, 3, 7, 8, 4)

newImpVarVectorFinal <- newImpVarVectorFinal[,columns.to.select] # UNCOMMENT LINE!!!
newImpVarVectorFinalTest <- newImpVarVectorFinalTest[,columns.to.select] # UNCOMMENT LINE FOR FINAL RUN

flight.tree <- rpart(formula = ArrDel15 ~ ., data=newImpVarVectorFinal,
                      control = rpart.control(minsplit=100, cp=0.002))
fltree <- as.party(flight.tree)
plot(fltree, gp = gpar(fontsize = 8))

plotcp(flight.tree)

min.cv.idx <- which.min(flight.tree$cptable[,"xerror"])
# min CV error + 1se (this is the height of the horizontal bar)
min.cv.err.1se <- flight.tree$cptable[min.cv.idx,"xerror"] +
                    flight.tree$cptable[min.cv.idx,"xstd"]
# Which cp values produce models whose error is below min CV + 1se?
candidate.cp.vals <- flight.tree$cptable[which(flight.tree$cptable[,"xerror"] < min.cv.err.1se),"CP"]
# 1-SE rule value of cp
cp.1se <- max(candidate.cp.vals)
print("1 se value of cp that we use for pruning: ")
## [1] "1 se value of cp that we use for pruning: "
print(round(cp.1se,5))
## [1] 0.00212
kable(flight.tree$cptable, digits=5, format="markdown")
CP nsplit rel error xerror xstd
0.52363 0 1.00000 1.00003 0.00144
0.02481 1 0.47637 0.47728 0.00455
0.02032 2 0.45157 0.45172 0.00422
0.00365 3 0.43124 0.43248 0.00406
0.00286 4 0.42759 0.42319 0.00396
0.00212 7 0.41862 0.41913 0.00392
0.00200 8 0.41650 0.41804 0.00388

The tree above gives several important insights. Firstly, we can see that there are 15 terminal nodes. Next we can map out many different paths. If the departure delay is greater than or equal to 14.5 minutes, we see that we can solely use the departure delay variable to predict if there will be an arrival delay of greater than 15 minutes or not. This makes sense, because if the flight departed later than the actual time then it would probably land later than the set arrival time. What was interesting to note was that there were cases when the flight departed more than 15 minutes late and still did not get classified as being an arrival delay of more than 15 minutes (when departure delay is greater than 21.5 minutes) Secondly, if the departure delay was between 2.5 minutes and 14.5 minutes, then again Departure delay in minutes was the single most important variable to decide if there would be a delay of 15 minutes or more. If teh departure delay was less than 2.5 minutes, then there were a number of predictors like distance and wheels off time that determined whether the flight would arrive 15 minutes late or not.

We will now Prune the tree to see if the prediction error stays almost the same with a less complex tree, using the cp value that we found in the previous part. Pruning helps us decide which path is the most important and prevents overfitting of the data because it “prunes” branches that are not very necessary.

flight.pruned <- prune(flight.tree, cp = cp.1se)

predict.pruned.flights <- predict(flight.pruned, newdata=newImpVarVectorFinalTest)

cutoff <- 0.50
predictedVals <- ifelse(predict.pruned.flights > cutoff, 1, 0)
conf.table <- table(preds = predictedVals, actual = newImpVarVectorFinalTest$ArrDel15)

# metrics <- getMetrics(conf.table)

misClassRate = mean(predictedVals != newImpVarVectorFinalTest$ArrDel15)
misClassRate
## [1] 0.1287602
TP <- conf.table["1", "1"]
FP <- conf.table["1", "0"]
TN <- conf.table["0", "0"]
FN <- conf.table["0", "1"]

The pruned tree gave a misclassification rate of 0.13 which turns out to be quite good. But what will be more informational is the proportion of values that we need. For the population that we have in mind (people who are coming to pick up their relatives from the airport), it is most important that our model has a high True positive rate or Sensitivity (so that they are able to correctly predict when there might be a delay and there actually is), a high true negative rate (to predict correctly when there might not be a delay and there isnt). Another very important metric is to minimize as much as possible, the number of cases when the person predicts there would not be an arrival delay but there is because this will lead to further inconvenience for the person. This is the false negative rate.

spec.table <- as.data.frame(cbind("Pruned tree: ", 
                                  "Sensitivity " = round(TP/(TP+FN),2),
                                  "Specificity " = round(TN/(TN+FP),2),
                                  "False negative rate " = round(FN/(FN+TP),2),
                                  "Accuracy " = 1-round(misClassRate,2),
                                  "Misclassification rate " = round(misClassRate,2)))

The table above shows the different metrics for the Pruned tree. We see that it performs fairly well on the given data set.

We now train an LDA model to see if LDA would be able to differentiate between the binary categorical variable of Arrivals being greater than 15 minutes or not. The reason we can use LDA is because the response variable is binary and LDA basically uses a straight line to discriminate between the two values for the response variable. Hence we use this too.

flight.delay.15.lda <- lda(ArrDel15  ~ ., data = newImpVarVectorFinal)
# plot(flight.delay.15.lda, col = as.numeric(newImpVarVectorFinal$ArrDel15)) # what does this plot tell us. Iris dataset wala homework 4.

confusion.lda <- table(predict(flight.delay.15.lda)$class,
                       newImpVarVectorFinal$ArrDel15)
TP.lda <- confusion.lda["1", "1"]
FP.lda <- confusion.lda["1", "0"]
TN.lda <- confusion.lda["0", "0"]
FN.lda <- confusion.lda["0", "1"]

spec.table.lda <- as.data.frame(cbind("LDA: ", 
                                  "Sensitivity " = round(TP.lda/(TP.lda+FN.lda),2),
                                  "Specificity " = round(TN.lda/(TN.lda+FP.lda),2),
                                  "False negative rate " = round(FN.lda/(FN.lda+TP.lda),2),
                                  "Accuracy " = round(sum(diag(confusion.lda)) / sum(confusion.lda),2),
                                  "Misclassification rate " = round(1 - sum(diag(confusion.lda)) / sum(confusion.lda),2)))

comb.table <- rbind(spec.table, spec.table.lda)
comb.table
##              V1 Sensitivity  Specificity  False negative rate  Accuracy 
## 1 Pruned tree:          0.76         0.95                 0.24      0.87
## 2         LDA:          0.56         0.99                 0.44      0.81
##   Misclassification rate 
## 1                    0.13
## 2                    0.19

We can see that although LDA performs slighty better for specificity, it does so at the expense of a large amount of sensitivity and false negative rate (both of which are important for us). Hence out of these two, we decide that a Pruned Tree is better than LDA because it helps us maximize or minimize metrics that are actually important to us. (Like maximize the sensitivity and specificity and minimize the false negative rate)

We now train a random forest model on the given data from 2006. A random forest is an ensemble learning method that we are using for our classification task. With random forests, we’re averaging over a bunch of bagged trees, and each tree is built by considering a small random subset of predictor variables at each split. This leads to a model that is very difficult to interpret. Random forests are very flexible and do not overfit the data. A random forest leads to a tree with high variance and low bias.

So we use the variable importance plot to understand what they are saying. In our case, the variable importance plot suggests that firstly, all of the different carrier variables are not at all important. Removing them does not affect the node purity at all. The node impurity is a measure of the homogeneity of the labels at the node. And we could see this with the tree that we showed earlier because Carrier was not present at any of the nodes. What we do see is that DepDelay (Departure delay in minutes) is the most important variable and removing this variable has the most effect on node purity. This is followed by the variables WheelsOff, Distance and Month.

# Random forests
classification.data.flights.train = newImpVarVectorFinal#[1:10000,] # comment this line for final analysis. Was taking too much time to run with the original dataset.

classification.data.flights.test = newImpVarVectorFinalTest
#newImpVarVectorFinal[6000:8000,] #comment this line too. Used this for testing

flights.rf <- randomForest(ArrDel15 ~ .,
                           data = classification.data.flights.train,
                           type = "response")
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
var.imp <- varImpPlot(flights.rf) # To find the most important variables

most.imp.vars <- rownames(var.imp)[order(var.imp, decreasing = TRUE)]

rf.test.prob <- predict(flights.rf, newdata = classification.data.flights.test, type = "response")
rf.test.prob.c = ifelse(rf.test.prob > 0.5, 1, 0)

table(pred=rf.test.prob.c,actual=classification.data.flights.test$ArrDel15)
##     actual
## pred    0    1
##    0 6875  909
##    1  301 4248
roc.rf <- roc(classification.data.flights.test$ArrDel15, rf.test.prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc.rf)

roc.rf$auc
## Area under the curve: 0.9713

We now train a boosted tree model

# Boosted trees and ROC curves
# classification.data.flights.train$Carrier = as.factor(classification.data.flights.train$Carrier)
# classification.data.flights.train$OriginCityName = as.factor(classification.data.flights.train$OriginCityName)
flights.boost=gbm(ArrDel15 ~ ., data = classification.data.flights.train, distribution = "bernoulli", shrinkage = 0.1,
                 n.trees=10000, interaction.depth=4, cv.folds = 5)

best.ntrees = gbm.perf(flights.boost, method = "cv")

pred.boost = predict(flights.boost, newdata=classification.data.flights.test, n.trees=best.ntrees, type="response")
roc.boost <- roc(classification.data.flights.test$ArrDel15, pred.boost) 
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc.boost, col = "red")
plot(roc.rf, col = "blue", add = TRUE)

roc.boost$auc
## Area under the curve: 0.9769
# Plotting the cv error with the number of trees on the x-axis
qplot(1:10000, flights.boost$cv.error, xlab = "Number of trees")

As CV decreases,

# K-means
classification.data.flights.train <- classification.data.flights.train[,-2]
classification.data.flights.train <- classification.data.flights.train[,-8:-22]
flights.scaled<-scale(classification.data.flights.train)
km.2 <- kmeans(flights.scaled, 2, nstart=20)
km.3 <- kmeans(flights.scaled, 3, nstart=20)
km.4 <- kmeans(flights.scaled, 4, nstart=20)
km.5 <- kmeans(flights.scaled, 5, nstart=20)

fviz_cluster(km.2, data=flights.scaled)